import numpy as np
import pandas as pd
#data visualization
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# Ignore Warnings
import warnings
warnings.filterwarnings('ignore')
#see max columns
pd.set_option('display.max_columns', None)
sns.set_style('whitegrid')
plt.rcParams['figure.dpi']=100
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# --- Reading Train Dataset ---
train.head(10).style.background_gradient(cmap='Blues').set_properties(**{'font-family': 'Segoe UI'})
| UniqueID | disbursed_amount | asset_cost | ltv | branch_id | supplier_id | manufacturer_id | Current_pincode_ID | Date.of.Birth | Employment.Type | DisbursalDate | State_ID | Employee_code_ID | MobileNo_Avl_Flag | Aadhar_flag | PAN_flag | VoterID_flag | Driving_flag | Passport_flag | PERFORM_CNS.SCORE | PERFORM_CNS.SCORE.DESCRIPTION | PRI.NO.OF.ACCTS | PRI.ACTIVE.ACCTS | PRI.OVERDUE.ACCTS | PRI.CURRENT.BALANCE | PRI.SANCTIONED.AMOUNT | PRI.DISBURSED.AMOUNT | SEC.NO.OF.ACCTS | SEC.ACTIVE.ACCTS | SEC.OVERDUE.ACCTS | SEC.CURRENT.BALANCE | SEC.SANCTIONED.AMOUNT | SEC.DISBURSED.AMOUNT | PRIMARY.INSTAL.AMT | SEC.INSTAL.AMT | NEW.ACCTS.IN.LAST.SIX.MONTHS | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | AVERAGE.ACCT.AGE | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | loan_default | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 420825.000000 | 50578.000000 | 58400.000000 | 89.550000 | 67.000000 | 22807.000000 | 45.000000 | 1441.000000 | 1/1/1984 | Salaried | 3/8/2018 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | No Bureau History Available | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0yrs 0mon | 0yrs 0mon | 0.000000 | 0.000000 |
| 1 | 537409.000000 | 47145.000000 | 65550.000000 | 73.230000 | 67.000000 | 22807.000000 | 45.000000 | 1502.000000 | 31-07-85 | Self employed | 26-09-18 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 598.000000 | I-Medium Risk | 1.000000 | 1.000000 | 1.000000 | 27600.000000 | 50200.000000 | 50200.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1991.000000 | 0.000000 | 0.000000 | 1.000000 | 1yrs 11mon | 1yrs 11mon | 0.000000 | 1.000000 |
| 2 | 417566.000000 | 53278.000000 | 61360.000000 | 89.630000 | 67.000000 | 22807.000000 | 45.000000 | 1497.000000 | 24-08-85 | Self employed | 1/8/2018 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | No Bureau History Available | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0yrs 0mon | 0yrs 0mon | 0.000000 | 0.000000 |
| 3 | 624493.000000 | 57513.000000 | 66113.000000 | 88.480000 | 67.000000 | 22807.000000 | 45.000000 | 1501.000000 | 30-12-93 | Self employed | 26-10-18 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 305.000000 | L-Very High Risk | 3.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 31.000000 | 0.000000 | 0.000000 | 0.000000 | 0yrs 8mon | 1yrs 3mon | 1.000000 | 1.000000 |
| 4 | 539055.000000 | 52378.000000 | 60300.000000 | 88.390000 | 67.000000 | 22807.000000 | 45.000000 | 1495.000000 | 9/12/1977 | Self employed | 26-09-18 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | No Bureau History Available | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0yrs 0mon | 0yrs 0mon | 1.000000 | 1.000000 |
| 5 | 518279.000000 | 54513.000000 | 61900.000000 | 89.660000 | 67.000000 | 22807.000000 | 45.000000 | 1501.000000 | 8/9/1990 | Self employed | 19-09-18 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 825.000000 | A-Very Low Risk | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1347.000000 | 0.000000 | 0.000000 | 0.000000 | 1yrs 9mon | 2yrs 0mon | 0.000000 | 0.000000 |
| 6 | 529269.000000 | 46349.000000 | 61500.000000 | 76.420000 | 67.000000 | 22807.000000 | 45.000000 | 1502.000000 | 1/6/1988 | Salaried | 23-09-18 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | No Bureau History Available | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0yrs 0mon | 0yrs 0mon | 0.000000 | 0.000000 |
| 7 | 510278.000000 | 43894.000000 | 61900.000000 | 71.890000 | 67.000000 | 22807.000000 | 45.000000 | 1501.000000 | 4/10/1989 | Salaried | 16-09-18 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 17.000000 | Not Scored: Not Enough Info available on the customer | 1.000000 | 1.000000 | 0.000000 | 72879.000000 | 74500.000000 | 74500.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0yrs 2mon | 0yrs 2mon | 0.000000 | 0.000000 |
| 8 | 490213.000000 | 53713.000000 | 61973.000000 | 89.560000 | 67.000000 | 22807.000000 | 45.000000 | 1497.000000 | 15-11-91 | Self employed | 5/9/2018 | 6.000000 | 1998.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 718.000000 | D-Very Low Risk | 1.000000 | 1.000000 | 0.000000 | -41.000000 | 365384.000000 | 365384.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 4yrs 8mon | 4yrs 8mon | 1.000000 | 0.000000 |
| 9 | 510980.000000 | 52603.000000 | 61300.000000 | 86.950000 | 67.000000 | 22807.000000 | 45.000000 | 1492.000000 | 1/6/1968 | Salaried | 16-09-18 | 6.000000 | 1998.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 818.000000 | A-Very Low Risk | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2608.000000 | 0.000000 | 0.000000 | 0.000000 | 1yrs 7mon | 1yrs 7mon | 0.000000 | 0.000000 |
#reading data dict
data_dic = pd.read_excel('data_dict.xlsx')
#quick eyeballing data
any_5_sample = pd.DataFrame(train.sample(5).T).reset_index()
#renaming first column from index to Variable name
any_5_sample.rename(columns = {'index':'Variable Name'},inplace=True)
# merging data dictionary with features to make a solid understanding
sample_with_discription = data_dic[['Variable Name','Description']].merge(any_5_sample,how='left',on='Variable Name')
sample_with_discription.style.set_properties(**{'font-family': 'Segoe UI'})
| Variable Name | Description | 43080 | 81983 | 90612 | 60308 | 4710 | |
|---|---|---|---|---|---|---|---|
| 0 | UniqueID | Identifier for customers | 544615.000000 | 587502.000000 | 517543.000000 | 471741.000000 | 517243.000000 |
| 1 | loan_default | Payment default in the first EMI on due date | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 2 | disbursed_amount | Amount of Loan disbursed | 55513.000000 | 45349.000000 | 83238.000000 | 44309.000000 | 56513.000000 |
| 3 | asset_cost | Cost of the Asset | 72000.000000 | 63090.000000 | 131238.000000 | 57182.000000 | 76862.000000 |
| 4 | ltv | Loan to Value of the asset | 79.170000 | 74.500000 | 64.960000 | 83.940000 | 76.110000 |
| 5 | branch_id | Branch where the loan was disbursed | 248.000000 | 130.000000 | 65.000000 | 74.000000 | 65.000000 |
| 6 | supplier_id | Vehicle Dealer where the loan was disbursed | 23970.000000 | 15235.000000 | 23039.000000 | 16846.000000 | 21617.000000 |
| 7 | manufacturer_id | Vehicle manufacturer(Hero, Honda, TVS etc.) | 86.000000 | 51.000000 | 67.000000 | 45.000000 | 86.000000 |
| 8 | Current_pincode | Current pincode of the customer | nan | nan | nan | nan | nan |
| 9 | Date.of.Birth | Date of birth of the customer | 24-11-93 | 8/10/1992 | 15-02-94 | 5/5/1975 | 1/3/1980 |
| 10 | Employment.Type | Employment Type of the customer (Salaried/Self Employed) | Salaried | Self employed | Self employed | Salaried | Salaried |
| 11 | DisbursalDate | Date of disbursement | 28-09-18 | 19-10-18 | 19-09-18 | 29-08-18 | 19-09-18 |
| 12 | State_ID | State of disbursement | 4.000000 | 6.000000 | 13.000000 | 4.000000 | 13.000000 |
| 13 | Employee_code_ID | Employee of the organization who logged the disbursement | 1662.000000 | 510.000000 | 199.000000 | 414.000000 | 1799.000000 |
| 14 | MobileNo_Avl_Flag | if Mobile no. was shared by the customer then flagged as 1 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
| 15 | Aadhar_flag | if aadhar was shared by the customer then flagged as 1 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 |
| 16 | PAN_flag | if pan was shared by the customer then flagged as 1 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 17 | VoterID_flag | if voter was shared by the customer then flagged as 1 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 |
| 18 | Driving_flag | if DL was shared by the customer then flagged as 1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 19 | Passport_flag | if passport was shared by the customer then flagged as 1 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 20 | PERFORM_CNS.SCORE | Bureau Score | 737.000000 | 624.000000 | 0.000000 | 539.000000 | 825.000000 |
| 21 | PERFORM_CNS.SCORE.DESCRIPTION | Bureau score description | C-Very Low Risk | H-Medium Risk | No Bureau History Available | J-High Risk | A-Very Low Risk |
| 22 | PRI.NO.OF.ACCTS | count of total loans taken by the customer at the time of disbursement | 1.000000 | 5.000000 | 0.000000 | 1.000000 | 1.000000 |
| 23 | PRI.ACTIVE.ACCTS | count of active loans taken by the customer at the time of disbursement | 1.000000 | 3.000000 | 0.000000 | 1.000000 | 0.000000 |
| 24 | PRI.OVERDUE.ACCTS | count of default accounts at the time of disbursement | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 25 | PRI.CURRENT.BALANCE | total Principal outstanding amount of the active loans at the time of disbursement | 150631.000000 | 6590.000000 | 0.000000 | 105741.000000 | 0.000000 |
| 26 | PRI.SANCTIONED.AMOUNT | total amount that was sanctioned for all the loans at the time of disbursement | 176249.000000 | 31560.000000 | 0.000000 | 0.000000 | 0.000000 |
| 27 | PRI.DISBURSED.AMOUNT | total amount that was disbursed for all the loans at the time of disbursement | 176249.000000 | 31560.000000 | 0.000000 | 0.000000 | 0.000000 |
| 28 | SEC.NO.OF.ACCTS | count of total loans taken by the customer at the time of disbursement | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 29 | SEC.ACTIVE.ACCTS | count of active loans taken by the customer at the time of disbursement | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 30 | SEC.OVERDUE.ACCTS | count of default accounts at the time of disbursement | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 31 | SEC.CURRENT.BALANCE | total Principal outstanding amount of the active loans at the time of disbursement | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 32 | SEC.SANCTIONED.AMOUNT | total amount that was sanctioned for all the loans at the time of disbursement | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 33 | SEC.DISBURSED.AMOUNT | total amount that was disbursed for all the loans at the time of disbursement | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 34 | PRIMARY.INSTAL.AMT | EMI Amount of the primary loan | 4302.000000 | 1584.000000 | 0.000000 | 0.000000 | 1365.000000 |
| 35 | SEC.INSTAL.AMT | EMI Amount of the secondary loan | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 36 | NEW.ACCTS.IN.LAST.SIX.MONTHS | New loans taken by the customer in last 6 months before the disbursment | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 |
| 37 | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | Loans defaulted in the last 6 months | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 |
| 38 | AVERAGE.ACCT.AGE | Average loan tenure | 0yrs 9mon | 0yrs 7mon | 0yrs 0mon | 3yrs 11mon | 1yrs 6mon |
| 39 | CREDIT.HISTORY.LENGTH | Time since first loan | 0yrs 9mon | 1yrs 2mon | 0yrs 0mon | 3yrs 11mon | 1yrs 6mon |
| 40 | NO.OF_INQUIRIES | Enquries done by the customer for loans | 0.000000 | 3.000000 | 0.000000 | 0.000000 | 0.000000 |
LTV -- Loan to Value of asset || idealy formula becomes || sometime other charges can be added to it $$LTV = \frac{disbursedAmount}{assetCost} $$
def dataset_shape_info(dataset):
# --- Print Dataset Info ---
print('\033[36m\033[1m'+'.: Dataset Info :.')
print('\033[0m\033[36m*' * 20)
print('\033[0m'+'Total Rows:'+'\033[36m\033[1m', dataset.shape[0])
print('\033[0m'+'Total Columns:'+'\033[36m\033[1m', dataset.shape[1])
print('\033[0m\033[36m*' * 20)
print('\n')
# --- Print Dataset Detail ---
print('\033[1m'+'.: Dataset Details :.')
print('\033[0m\033[36m*' * 22 +'\033[0m')
dataset.info(memory_usage = False)
dataset_shape_info(train)
.: Dataset Info :. ******************** Total Rows: 130393 Total Columns: 41 ******************** .: Dataset Details :. ********************** <class 'pandas.core.frame.DataFrame'> RangeIndex: 130393 entries, 0 to 130392 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UniqueID 129999 non-null float64 1 disbursed_amount 129999 non-null float64 2 asset_cost 129999 non-null float64 3 ltv 129999 non-null float64 4 branch_id 129999 non-null float64 5 supplier_id 129999 non-null float64 6 manufacturer_id 129999 non-null float64 7 Current_pincode_ID 129999 non-null float64 8 Date.of.Birth 129999 non-null object 9 Employment.Type 125463 non-null object 10 DisbursalDate 129999 non-null object 11 State_ID 129999 non-null float64 12 Employee_code_ID 129999 non-null float64 13 MobileNo_Avl_Flag 129999 non-null float64 14 Aadhar_flag 129999 non-null float64 15 PAN_flag 129999 non-null float64 16 VoterID_flag 129999 non-null float64 17 Driving_flag 129999 non-null float64 18 Passport_flag 129999 non-null float64 19 PERFORM_CNS.SCORE 129999 non-null float64 20 PERFORM_CNS.SCORE.DESCRIPTION 129999 non-null object 21 PRI.NO.OF.ACCTS 129999 non-null float64 22 PRI.ACTIVE.ACCTS 129999 non-null float64 23 PRI.OVERDUE.ACCTS 129999 non-null float64 24 PRI.CURRENT.BALANCE 129999 non-null float64 25 PRI.SANCTIONED.AMOUNT 129999 non-null float64 26 PRI.DISBURSED.AMOUNT 129999 non-null float64 27 SEC.NO.OF.ACCTS 129999 non-null float64 28 SEC.ACTIVE.ACCTS 129999 non-null float64 29 SEC.OVERDUE.ACCTS 129999 non-null float64 30 SEC.CURRENT.BALANCE 129999 non-null float64 31 SEC.SANCTIONED.AMOUNT 129999 non-null float64 32 SEC.DISBURSED.AMOUNT 129999 non-null float64 33 PRIMARY.INSTAL.AMT 129999 non-null float64 34 SEC.INSTAL.AMT 129999 non-null float64 35 NEW.ACCTS.IN.LAST.SIX.MONTHS 129999 non-null float64 36 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 129999 non-null float64 37 AVERAGE.ACCT.AGE 129999 non-null object 38 CREDIT.HISTORY.LENGTH 129999 non-null object 39 NO.OF_INQUIRIES 129999 non-null float64 40 loan_default 129999 non-null float64 dtypes: float64(35), object(6)
dataset_shape_info(test)
.: Dataset Info :. ******************** Total Rows: 112392 Total Columns: 40 ******************** .: Dataset Details :. ********************** <class 'pandas.core.frame.DataFrame'> RangeIndex: 112392 entries, 0 to 112391 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UniqueID 112392 non-null int64 1 disbursed_amount 112392 non-null int64 2 asset_cost 112392 non-null int64 3 ltv 112392 non-null float64 4 branch_id 112392 non-null int64 5 supplier_id 112392 non-null int64 6 manufacturer_id 112392 non-null int64 7 Current_pincode_ID 112392 non-null int64 8 Date.of.Birth 112392 non-null object 9 Employment.Type 108949 non-null object 10 DisbursalDate 112392 non-null object 11 State_ID 112392 non-null int64 12 Employee_code_ID 112392 non-null int64 13 MobileNo_Avl_Flag 112392 non-null int64 14 Aadhar_flag 112392 non-null int64 15 PAN_flag 112392 non-null int64 16 VoterID_flag 112392 non-null int64 17 Driving_flag 112392 non-null int64 18 Passport_flag 112392 non-null int64 19 PERFORM_CNS.SCORE 112392 non-null int64 20 PERFORM_CNS.SCORE.DESCRIPTION 112392 non-null object 21 PRI.NO.OF.ACCTS 112392 non-null int64 22 PRI.ACTIVE.ACCTS 112392 non-null int64 23 PRI.OVERDUE.ACCTS 112392 non-null int64 24 PRI.CURRENT.BALANCE 112392 non-null int64 25 PRI.SANCTIONED.AMOUNT 112392 non-null int64 26 PRI.DISBURSED.AMOUNT 112392 non-null int64 27 SEC.NO.OF.ACCTS 112392 non-null int64 28 SEC.ACTIVE.ACCTS 112392 non-null int64 29 SEC.OVERDUE.ACCTS 112392 non-null int64 30 SEC.CURRENT.BALANCE 112392 non-null int64 31 SEC.SANCTIONED.AMOUNT 112392 non-null int64 32 SEC.DISBURSED.AMOUNT 112392 non-null int64 33 PRIMARY.INSTAL.AMT 112392 non-null int64 34 SEC.INSTAL.AMT 112392 non-null int64 35 NEW.ACCTS.IN.LAST.SIX.MONTHS 112392 non-null int64 36 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 112392 non-null int64 37 AVERAGE.ACCT.AGE 112392 non-null object 38 CREDIT.HISTORY.LENGTH 112392 non-null object 39 NO.OF_INQUIRIES 112392 non-null int64 dtypes: float64(1), int64(33), object(6)
#if category exist then there distribution and nuniques in general
print('\033[36m*' * 29+'\033[0m')
for each_column in train.columns:
each_column_nunique = train[each_column].nunique()
print(f"Column: [ {each_column} ] nuniques are: {each_column_nunique}\n")
#if category are less then 3 print them with count
if each_column_nunique <= 3:
print(sorted(train[each_column].value_counts().to_dict().items()),'\n')
print('\033[36m*' * 29+'\033[0m')
***************************** Column: [ UniqueID ] nuniques are: 129999 ***************************** Column: [ disbursed_amount ] nuniques are: 17496 ***************************** Column: [ asset_cost ] nuniques are: 33734 ***************************** Column: [ ltv ] nuniques are: 6158 ***************************** Column: [ branch_id ] nuniques are: 64 ***************************** Column: [ supplier_id ] nuniques are: 2116 ***************************** Column: [ manufacturer_id ] nuniques are: 10 ***************************** Column: [ Current_pincode_ID ] nuniques are: 4877 ***************************** Column: [ Date.of.Birth ] nuniques are: 14458 ***************************** Column: [ Employment.Type ] nuniques are: 2 [('Salaried', 55537), ('Self employed', 69926)] ***************************** Column: [ DisbursalDate ] nuniques are: 84 ***************************** Column: [ State_ID ] nuniques are: 20 ***************************** Column: [ Employee_code_ID ] nuniques are: 1337 ***************************** Column: [ MobileNo_Avl_Flag ] nuniques are: 1 [(1.0, 129999)] ***************************** Column: [ Aadhar_flag ] nuniques are: 2 [(0.0, 23582), (1.0, 106417)] ***************************** Column: [ PAN_flag ] nuniques are: 2 [(0.0, 118848), (1.0, 11151)] ***************************** Column: [ VoterID_flag ] nuniques are: 2 [(0.0, 108404), (1.0, 21595)] ***************************** Column: [ Driving_flag ] nuniques are: 2 [(0.0, 126614), (1.0, 3385)] ***************************** Column: [ Passport_flag ] nuniques are: 2 [(0.0, 129682), (1.0, 317)] ***************************** Column: [ PERFORM_CNS.SCORE ] nuniques are: 568 ***************************** Column: [ PERFORM_CNS.SCORE.DESCRIPTION ] nuniques are: 20 ***************************** Column: [ PRI.NO.OF.ACCTS ] nuniques are: 94 ***************************** Column: [ PRI.ACTIVE.ACCTS ] nuniques are: 37 ***************************** Column: [ PRI.OVERDUE.ACCTS ] nuniques are: 20 ***************************** Column: [ PRI.CURRENT.BALANCE ] nuniques are: 42081 ***************************** Column: [ PRI.SANCTIONED.AMOUNT ] nuniques are: 26666 ***************************** Column: [ PRI.DISBURSED.AMOUNT ] nuniques are: 28626 ***************************** Column: [ SEC.NO.OF.ACCTS ] nuniques are: 33 ***************************** Column: [ SEC.ACTIVE.ACCTS ] nuniques are: 20 ***************************** Column: [ SEC.OVERDUE.ACCTS ] nuniques are: 8 ***************************** Column: [ SEC.CURRENT.BALANCE ] nuniques are: 1835 ***************************** Column: [ SEC.SANCTIONED.AMOUNT ] nuniques are: 1338 ***************************** Column: [ SEC.DISBURSED.AMOUNT ] nuniques are: 1520 ***************************** Column: [ PRIMARY.INSTAL.AMT ] nuniques are: 19178 ***************************** Column: [ SEC.INSTAL.AMT ] nuniques are: 1112 ***************************** Column: [ NEW.ACCTS.IN.LAST.SIX.MONTHS ] nuniques are: 24 ***************************** Column: [ DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS ] nuniques are: 13 ***************************** Column: [ AVERAGE.ACCT.AGE ] nuniques are: 183 ***************************** Column: [ CREDIT.HISTORY.LENGTH ] nuniques are: 267 ***************************** Column: [ NO.OF_INQUIRIES ] nuniques are: 24 ***************************** Column: [ loan_default ] nuniques are: 2 [(0.0, 103107), (1.0, 26892)] *****************************
EDA and feature engineering¶#target
target = train['loan_default']
###Features
target_features = ['loan_default']
id_features = ['UniqueID','branch_id','supplier_id','Current_pincode','Employee_code_ID']
date_time_features = ['Date.of.Birth','DisbursalDate','CREDIT.HISTORY.LENGTH','AVERAGE.ACCT.AGE']
categorical_features = ['Employment.Type','PERFORM_CNS.SCORE.DESCRIPTION','manufacturer_id','State_ID','MobileNo_Avl_Flag',
'Aadhar_flag','PAN_flag','VoterID_flag','Driving_flag','Passport_flag']
segrigated_features = target_features + id_features + date_time_features + categorical_features
numerical_features = [particular_colum for particular_colum in train.columns if particular_colum not in segrigated_features]
NOTE
We will EDA now based on these features and will try to get every required information out of these features
pd.isnull(train['branch_id']).sum()
394
import matplotlib.ticker as ticker
import seaborn as sns
#function to visualise { Categorical VAriables }
def cat_var_eda(data,features):
"""
[ Univariate Analysis For Categorical Variables ]
Will take a group of variables-[CATEGORY] and will plot or print:
1. Bar Plot
2. value_counts
"""
#setting figure size and all
fig_size = len(features)
fig = plt.figure(figsize=(12*fig_size,5),dpi=100)
fig.canvas.draw()
plt.tight_layout(pad=1.08,h_pad=None,w_pad=100000,rect=None)
for index,values in enumerate(features):
plt.subplot(1, fig_size, index + 1)
ax = sns.countplot(x=values, data=data, orient='h') #horizontal countplot
plt.xlabel(f"{values}", fontsize=12)
not_null_count = data.shape[0] - pd.isnull(data[values]).sum()
#twin axis formation
ax2 = ax.twinx()
#count axis on right, frequency on left
ax2.yaxis.tick_left()
ax.yaxis.tick_right()
#also switching labels over
ax.yaxis.set_label_position('right')
ax2.yaxis.set_label_position('left')
ax2.set_ylabel('Frequency [%]')
for p in ax.patches:
x = p.get_bbox().get_points()[:,0]
y = p.get_bbox().get_points()[1,1]
ax.annotate('{:.1f}'.format(100. * y / not_null_count),
(x.mean(),y), ha='center', va='bottom') #setting text allignment
#using a LinearLocator to ensure correct number of ticks
ax.yaxis.set_major_locator(ticker.LinearLocator(11))
ax.set_ylim(0,not_null_count)
ax2.set_ylim(0,180)
#tick spacing of 10
ax2.yaxis.set_major_locator(ticker.MultipleLocator(10))
ax2.grid(alpha=0.3)
plt.show
loan_default --> Target¶#target variable
cat_var_eda(train,['loan_default'])
train.shape[0]
130393
Imbalanced classification as expected from a loan default use case
State_ID¶#State_ID
cat_var_eda(train,['State_ID'])
Obsevation:
Most loan are taken from customer belonging to state 4,3,6,13,9 and 8
As most of the earning comes out of some of the state as not all state have same earning and industry and stufff stuff.
manufacturer_id¶#manufacturer_id
cat_var_eda(train,['manufacturer_id'])
Observation:
manufacturer_id - 120,49 have almost similar number of customersmanufacturer_id - 86,45 have more than 70% of market share combinedmanufacturer_id - 86,45,51 have more than 90% of market share combinedmanufacturer_id - 152,153,156 have 0% of market shareEmployment.Type¶#Employment.Type
cat_var_eda(train,['Employment.Type'])
Observation:
More customers belong to self employed category based on this plot
There are some missing values as well but not significant in this case
#3% null value not much significant as discussed
(train['Employment.Type'].isnull().sum() / train.shape[0])*100
3.780877807857784
Identification Document Flags¶Either id was given or not
#using melt as we have more then 1 id variables
diff_flags_uniques = pd.melt(frame=train, value_vars=['MobileNo_Avl_Flag','Aadhar_flag','PAN_flag',
'VoterID_flag','Driving_flag','Passport_flag'])
diff_flags_uniques.head(5)
| variable | value | |
|---|---|---|
| 0 | MobileNo_Avl_Flag | 1.0 |
| 1 | MobileNo_Avl_Flag | 1.0 |
| 2 | MobileNo_Avl_Flag | 1.0 |
| 3 | MobileNo_Avl_Flag | 1.0 |
| 4 | MobileNo_Avl_Flag | 1.0 |
df_diff_flags_uniques = pd.DataFrame(diff_flags_uniques.groupby(['variable','value'])['value'].count()).sort_index(level=[0,1])\
.rename(columns={'value':'count'}).reset_index()
df_diff_flags_uniques.head(5)
| variable | value | count | |
|---|---|---|---|
| 0 | Aadhar_flag | 0.0 | 23582 |
| 1 | Aadhar_flag | 1.0 | 106417 |
| 2 | Driving_flag | 0.0 | 126614 |
| 3 | Driving_flag | 1.0 | 3385 |
| 4 | MobileNo_Avl_Flag | 1.0 | 129999 |
#visualizing
sns.catplot(x='variable', y='count', hue='value', data=df_diff_flags_uniques, kind='bar')
plt.xticks(rotation=45)
plt.show()
Observation:
Using Kernel Density Estimation-KDE plot
#this is a list of all features/variaibles which we have to usnderstand
print(numerical_features)
['disbursed_amount', 'asset_cost', 'ltv', 'Current_pincode_ID', 'PERFORM_CNS.SCORE', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES']
#fucntoin Univariate Numerical Variables Analysis
def num_var_eda(data,features,log=False,dpi=110):
fig_size = len(features)
plt.figure(figsize=(8*fig_size,5),dpi=dpi)
for index,values in enumerate(features):
#calculating descriptives of variable
minimum = data[values].min()
maximum = data[values].max()
mean = data[values].mean()
median = data[values].median()
st_dev = data[values].std()
#calculating points of one standard deviation
points = mean - st_dev,mean + st_dev
#plotting variable with every information
plt.subplot(1,fig_size,index + 1)
ax = sns.kdeplot(data[values],shade=True) #KDE plot
if log == True:
ax.set_xscale('log')
else:
pass
plt.xlabel(f'{values}',fontsize=13)
plt.ylabel('density')
plt.title(f'std_dev={(round(points[0],2),round(points[1],2))}, range={(round(minimum,2),round(maximum,2))}\nmean={round(mean,2)}, median={round(median,2)}')
#without log
num_var_eda(train,['disbursed_amount','asset_cost'])
Observation
asset_cost is more then disbursed_amount it is a valid output ==> most of customer do some down payment they never take loan on initeal amountBoth KDE plot are Right Skewed here are some outliers, will apply log scaling on x-axis
#with log
num_var_eda(train,['disbursed_amount','asset_cost'],log=True)
Observation:
disbursed_amoun & asset_cost values lie between appx-10000 to 100000 which is expected from a two wheeler loanCustomer generaly dont take all money in loan, there is somthing downpament involved
NOTE:
Box-plot can be used here
num_var_eda(train,['ltv'],log=False)
Observation
#column mens no beauro data for these peoples
num_var_eda(train,['PERFORM_CNS.SCORE'])
Observation:
cns score descriptiontrain['PERFORM_CNS.SCORE'].value_counts()
0.0 65595
738.0 5074
300.0 4624
825.0 4492
15.0 2142
...
849.0 1
862.0 1
822.0 1
884.0 1
840.0 1
Name: PERFORM_CNS.SCORE, Length: 568, dtype: int64
train['PERFORM_CNS.SCORE.DESCRIPTION'].value_counts(normalize=True)
Vechile loans are generaly low ammount so this validates that most of customer do not have any previous credit history moany of user can be from villages
#this feature is categorial in natures but most related to 'PERFORM_CNS.SCORE' so I am exploring it now
train['PERFORM_CNS.SCORE.DESCRIPTION'].value_counts()
No Bureau History Available 65595 C-Very Low Risk 9163 A-Very Low Risk 8254 D-Very Low Risk 6354 B-Very Low Risk 5239 M-Very High Risk 4624 F-Low Risk 4583 K-High Risk 4503 H-Medium Risk 3689 E-Low Risk 3163 I-Medium Risk 3012 G-Low Risk 2173 Not Scored: Sufficient History Not Available 2142 J-High Risk 1992 Not Scored: Not Enough Info available on the customer 1947 Not Scored: No Activity seen on the customer (Inactive) 1598 Not Scored: No Updates available in last 36 months 857 L-Very High Risk 585 Not Scored: Only a Guarantor 524 Not Scored: More than 50 active Accounts found 2 Name: PERFORM_CNS.SCORE.DESCRIPTION, dtype: int64
Hunch was right as we can see clearly that most of users do not possess a bureau history
Will `group by this category and check average of score for each category and to interpret it better
#summary for each category
customer_score_summ = pd.DataFrame(train.groupby('PERFORM_CNS.SCORE.DESCRIPTION')['PERFORM_CNS.SCORE'].mean()).reset_index().sort_values(by='PERFORM_CNS.SCORE',ascending=False)
with pd.option_context('display.max_rows',None):
display(customer_score_summ)
| PERFORM_CNS.SCORE.DESCRIPTION | PERFORM_CNS.SCORE | |
|---|---|---|
| 0 | A-Very Low Risk | 827.453356 |
| 1 | B-Very Low Risk | 774.065661 |
| 2 | C-Very Low Risk | 741.917167 |
| 3 | D-Very Low Risk | 715.919421 |
| 4 | E-Low Risk | 691.586785 |
| 5 | F-Low Risk | 666.100589 |
| 6 | G-Low Risk | 640.646572 |
| 7 | H-Medium Risk | 617.029547 |
| 8 | I-Medium Risk | 586.755644 |
| 9 | J-High Risk | 549.430723 |
| 10 | K-High Risk | 440.499445 |
| 11 | L-Very High Risk | 326.970940 |
| 12 | M-Very High Risk | 300.000000 |
| 16 | Not Scored: No Updates available in last 36 mo... | 18.000000 |
| 17 | Not Scored: Not Enough Info available on the c... | 17.000000 |
| 15 | Not Scored: No Activity seen on the customer (... | 16.000000 |
| 19 | Not Scored: Sufficient History Not Available | 15.000000 |
| 18 | Not Scored: Only a Guarantor | 14.000000 |
| 14 | Not Scored: More than 50 active Accounts found | 11.000000 |
| 13 | No Bureau History Available | 0.000000 |
Observation:
Loan taken by user itself features
primary_loan_features = ['PRI.NO.OF.ACCTS','PRI.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS','PRI.CURRENT.BALANCE','PRI.SANCTIONED.AMOUNT',
'PRI.DISBURSED.AMOUNT']
#total accounts & active accounts for each customer
num_var_eda(train,['PRI.NO.OF.ACCTS','PRI.ACTIVE.ACCTS'], log=False)
#not continious in nature
train['PRI.NO.OF.ACCTS'].head()
0 0.0 1 1.0 2 0.0 3 3.0 4 0.0 Name: PRI.NO.OF.ACCTS, dtype: float64
Observation
These variables are not continious that is why you see see above plotes
We can use Histogram in this case
#visualising Primary loan features
num_var_eda(train,['PRI.DISBURSED.AMOUNT','PRI.SANCTIONED.AMOUNT','PRI.CURRENT.BALANCE','PRIMARY.INSTAL.AMT'],log=True)
Observation
Check-1
PRI.ACTIVE.ACCTSPRI.NO.OF.ACCTSSo PRI.ACTIVE.ACCTS cannot be greater then PRI.NO.OF.ACCTS
#Check 1
train[train['PRI.NO.OF.ACCTS'] < train['PRI.ACTIVE.ACCTS']]
| UniqueID | disbursed_amount | asset_cost | ltv | branch_id | supplier_id | manufacturer_id | Current_pincode_ID | Date.of.Birth | Employment.Type | DisbursalDate | State_ID | Employee_code_ID | MobileNo_Avl_Flag | Aadhar_flag | PAN_flag | VoterID_flag | Driving_flag | Passport_flag | PERFORM_CNS.SCORE | PERFORM_CNS.SCORE.DESCRIPTION | PRI.NO.OF.ACCTS | PRI.ACTIVE.ACCTS | PRI.OVERDUE.ACCTS | PRI.CURRENT.BALANCE | PRI.SANCTIONED.AMOUNT | PRI.DISBURSED.AMOUNT | SEC.NO.OF.ACCTS | SEC.ACTIVE.ACCTS | SEC.OVERDUE.ACCTS | SEC.CURRENT.BALANCE | SEC.SANCTIONED.AMOUNT | SEC.DISBURSED.AMOUNT | PRIMARY.INSTAL.AMT | SEC.INSTAL.AMT | NEW.ACCTS.IN.LAST.SIX.MONTHS | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | AVERAGE.ACCT.AGE | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | loan_default |
|---|
Nothing means there is no problem in our data
#sanity 2
train[train['PRI.NO.OF.ACCTS'] < train['PRI.OVERDUE.ACCTS']]
| UniqueID | disbursed_amount | asset_cost | ltv | branch_id | supplier_id | manufacturer_id | Current_pincode_ID | Date.of.Birth | Employment.Type | DisbursalDate | State_ID | Employee_code_ID | MobileNo_Avl_Flag | Aadhar_flag | PAN_flag | VoterID_flag | Driving_flag | Passport_flag | PERFORM_CNS.SCORE | PERFORM_CNS.SCORE.DESCRIPTION | PRI.NO.OF.ACCTS | PRI.ACTIVE.ACCTS | PRI.OVERDUE.ACCTS | PRI.CURRENT.BALANCE | PRI.SANCTIONED.AMOUNT | PRI.DISBURSED.AMOUNT | SEC.NO.OF.ACCTS | SEC.ACTIVE.ACCTS | SEC.OVERDUE.ACCTS | SEC.CURRENT.BALANCE | SEC.SANCTIONED.AMOUNT | SEC.DISBURSED.AMOUNT | PRIMARY.INSTAL.AMT | SEC.INSTAL.AMT | NEW.ACCTS.IN.LAST.SIX.MONTHS | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | AVERAGE.ACCT.AGE | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | loan_default |
|---|
#checking how many of loan holders have primary account information
train['no_prim_loan_flag'] = train['PRI.NO.OF.ACCTS'].mask(train['PRI.NO.OF.ACCTS'] > 0,1) #if number of account-[PRI.NO.OF.ACCTS] greater then 1 put 1 there
#checking how many customers have active & Overdue accounts
train['no_active_prim_loan_flag'] = train['PRI.ACTIVE.ACCTS'].mask(train['PRI.ACTIVE.ACCTS'] > 0,1)
train['no_overdue_prim_loan_flag'] = train['PRI.OVERDUE.ACCTS'].mask(train['PRI.OVERDUE.ACCTS'] > 0,1)
#collectively visualise above 3 variables
df_uniques = pd.melt(frame=train, value_vars=['no_prim_loan_flag','no_active_prim_loan_flag','no_overdue_prim_loan_flag'])
df_uniques = pd.DataFrame(df_uniques.groupby(['variable', 'value'])['value'].count()).sort_index(level=[0, 1])\
.rename(columns={'value': 'count'}).reset_index()
#catplot
sns.catplot(x='variable', y='count', hue='value', data=df_uniques, kind='bar')
plt.xticks(rotation=25)
plt.show()
Observation
train['PERFORM_CNS.SCORE'].value_counts()
0.0 65595
738.0 5074
300.0 4624
825.0 4492
15.0 2142
...
849.0 1
862.0 1
822.0 1
884.0 1
840.0 1
Name: PERFORM_CNS.SCORE, Length: 568, dtype: int64
no_prim_loan_flag ==> people who have primary account == 1
#user who dont have no_primary_loan_accounts what is there score description
train[train['no_prim_loan_flag']==0]['PERFORM_CNS.SCORE.DESCRIPTION'].value_counts(normalize=True)
No Bureau History Available 1.0 Name: PERFORM_CNS.SCORE.DESCRIPTION, dtype: float64
Concluding
From primary loan information and basic_beauro infromation can say 50% of customers dont have any credit history, we have validated it up till now
sec_loan_features = ['SEC.NO.OF.ACCTS','SEC.ACTIVE.ACCTS','SEC.OVERDUE.ACCTS','SEC.CURRENT.BALANCE',
'SEC.SANCTIONED.AMOUNT','SEC.DISBURSED.AMOUNT','SEC.INSTAL.AMT']
train['SEC.DISBURSED.AMOUNT'].value_counts(normalize=True)
0.0 0.984038
50000.0 0.000285
100000.0 0.000185
200000.0 0.000169
40000.0 0.000146
...
13400.0 0.000008
248537.0 0.000008
471000.0 0.000008
15604.0 0.000008
15441.0 0.000008
Name: SEC.DISBURSED.AMOUNT, Length: 1520, dtype: float64
Observation:
#checking other flags from bureau data
train['new_acct_flag'] = train['NEW.ACCTS.IN.LAST.SIX.MONTHS'].mask(train['NEW.ACCTS.IN.LAST.SIX.MONTHS'] > 0,1)
train['delinquent_flag'] = train['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'].mask(train['DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS'] > 0,1)
train['inquiry_flag'] = train['NO.OF_INQUIRIES'].mask(train['NO.OF_INQUIRIES'] > 0,1)
#plotting same information
df_uniques = pd.melt(frame=train,value_vars=['new_acct_flag','delinquent_flag','inquiry_flag','no_overdue_prim_loan_flag'])
df_uniques = pd.DataFrame(df_uniques.groupby(['variable','value'])['value'].count()).sort_index(level=[0, 1]) \
.rename(columns={'value': 'count'}).reset_index()
#catplot
sns.catplot(x='variable', y='count', hue='value', data=df_uniques, kind='bar')
plt.xticks(rotation=25)
plt.show()
Observation
From above analysis we can say 50% user dont have previous account information and then for rest of 50% hae this information which can be inderstood from above plot
Let us explore ID features now and see if we have some interesting information
#list of all id features
id_features
['UniqueID', 'branch_id', 'supplier_id', 'Current_pincode', 'Employee_code_ID']
#uniqueid ==> unique for each customer must be same as numbe of rows in data
train['UniqueID'].nunique(), train.shape[0]
(233154, 233154)
Observation ==> no repeated customer found
# employ
train.Employee_code_ID.nunique()
3270
Observation
Employ who have dispursed loan in any manny are 3270
#[[ Do all employees work in a single branch????? ]]
train[['branch_id','Employee_code_ID']].drop_duplicates().groupby('Employee_code_ID')['branch_id'].count().\
reset_index()['branch_id'].value_counts()
branch_id 1 3109 2 161 Name: count, dtype: int64
Observatin
161-Employs are mapped to multiple branches oops, Problem found
pincode
if pincode have any sequence??
train.Current_pincode_ID.head()
0 1441 1 1502 2 1497 3 1501 4 1495 Name: Current_pincode_ID, dtype: int64
There are randome number not pincode, our task is to find out if these number make any sense
Sharing pincode can reveal location of customer there migh be any clause underwhich bank cannot share exact pin code, but still there will be some paterns as 1441 will be near to 1442 and kinde of thing, lets find out
#range of number for every branch_id
train.groupby('branch_id').agg(min_pin=('Current_pincode_ID',min),max_pin=('Current_pincode_ID',max))
| min_pin | max_pin | |
|---|---|---|
| branch_id | ||
| 1 | 4906 | 6146 |
| 2 | 1593 | 2398 |
| 3 | 1 | 604 |
| 5 | 3299 | 3588 |
| 7 | 5725 | 5858 |
| ... | ... | ... |
| 257 | 936 | 970 |
| 258 | 125 | 178 |
| 259 | 216 | 258 |
| 260 | 4153 | 4288 |
| 261 | 179 | 214 |
82 rows × 2 columns
Observation
On basis of above table Current_pincode_ID has a sequence, they are not randomly assigned numbers
Current_pincode_ID is important feature for model building
To make this finding more impact full lets do same with 'supplier_id'
#range of number for every supplier_id
train.groupby('supplier_id').agg(min_pin=('Current_pincode_ID',min),max_pin=('Current_pincode_ID',max)).sort_index()
| min_pin | max_pin | |
|---|---|---|
| supplier_id | ||
| 10524 | 5368 | 5425 |
| 12311 | 5368 | 5426 |
| 12312 | 5064 | 5221 |
| 12374 | 5108 | 5219 |
| 12441 | 5368 | 5476 |
| ... | ... | ... |
| 24794 | 6982 | 6982 |
| 24797 | 727 | 776 |
| 24799 | 104 | 104 |
| 24802 | 1467 | 1471 |
| 24803 | 5471 | 5473 |
2953 rows × 2 columns
Observation
Current_pincode_ID has a sequence#range of number for every Employee_code_ID
train.groupby('Employee_code_ID').agg(min_pin=('Current_pincode_ID',min),max_pin=('Current_pincode_ID',max)).sort_index()
| min_pin | max_pin | |
|---|---|---|
| Employee_code_ID | ||
| 1 | 3 | 100 |
| 3 | 5939 | 6010 |
| 4 | 2679 | 2769 |
| 5 | 5060 | 5114 |
| 7 | 6158 | 6379 |
| ... | ... | ... |
| 3791 | 820 | 828 |
| 3792 | 3003 | 3003 |
| 3793 | 4638 | 4638 |
| 3794 | 248 | 248 |
| 3795 | 3675 | 3675 |
3270 rows × 2 columns
Observation
See Eemploy with id 1 works with 3-100 pin reason customers only
Concluding
All employees and suppliers are generally operating in a single location but only ids for pin have meaning
Checking frequency of each branch ID, Supplier ID & Employee ID¶train['branch_id'].value_counts()
branch_id
2 13138
67 11328
3 9230
5 9218
36 8832
...
217 183
261 176
84 156
111 89
158 69
Name: count, Length: 82, dtype: int64
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['branch_id'].value_counts(),bins=40)
plt.show()
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['supplier_id'].value_counts().values,bins=200,log=False)
plt.show()
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['Employee_code_ID'].value_counts().values,bins=200,log=False)
plt.xlabel('total_customer_count')
plt.ylabel('employ_count')
plt.show()
Most of employ are working with single loan
#all datetime features list
date_time_features
['Date.of.Birth', 'DisbursalDate', 'CREDIT.HISTORY.LENGTH', 'AVERAGE.ACCT.AGE']
DisbursalDate
train['DisbursalDate'] = pd.to_datetime(train['DisbursalDate'], format='%d-%m-%y') #train data
test['DisbursalDate'] = pd.to_datetime(test['DisbursalDate'], format='%d-%m-%y') #test data
#Date.of.Birth feature
train['Date.of.Birth'] = pd.to_datetime(train['Date.of.Birth'], format='%d-%m-%y')
test['Date.of.Birth'] = pd.to_datetime(test['Date.of.Birth'], format='%d-%m-%y')
#train
train['DisbursalDate'].describe()
count 233154 mean 2018-09-23 09:57:53.079596032 min 2018-08-01 00:00:00 25% 2018-08-30 00:00:00 50% 2018-09-25 00:00:00 75% 2018-10-21 00:00:00 max 2018-10-31 00:00:00 Name: DisbursalDate, dtype: object
#test
test['DisbursalDate'].describe()
count 112392 mean 2018-11-18 06:01:39.935938816 min 2018-11-03 00:00:00 25% 2018-11-13 00:00:00 50% 2018-11-17 00:00:00 75% 2018-11-23 00:00:00 max 2018-11-30 00:00:00 Name: DisbursalDate, dtype: object
Observation
DisbursalDates in test dataset lie in month of November 2018DisbursalDates in test dataset lie in future of all DisbursalDates in train dataset# finding age in years
train['age_in_days'] = (train.DisbursalDate - train['Date.of.Birth']).dt.days
train['age_in_years'] = train['age_in_days']/365.25 #every 4 year have 1 more extra day-leap year so .25 i.e 1/4
#extracting more features out of disbursal date
train['disbursal_day'] = train.DisbursalDate.dt.day
train['disbursal_month'] = train.DisbursalDate.dt.month
train['disbursed_dayofweek'] = train.DisbursalDate.dt.dayofweek
#Disbursal Month Distribution
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['disbursal_month'])
plt.xlabel('month')
plt.ylabel('loan_disbursed')
plt.show()
#disbursal Day Distribution
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['disbursal_day'])
plt.xlabel('day')
plt.ylabel('loan_disbursed')
plt.show()
Observation
Loan generally get disbursed on last day of month, general practice, graph validates that so no further invest
#Disbursal Day of Week Distribution
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['disbursed_dayofweek'])
plt.xlabel('day_of_week')
plt.ylabel('loan_disbursed')
plt.show()
#distribution of age
plt.figure(figsize=(8,3),dpi=100)
plt.hist(np.round(train['age_in_years'],5),bins=50)
plt.xlabel('age_in_years')
plt.ylabel('customer_count')
plt.show()
Observation:
Customer less then 18 cant get a loan on themself, so plot validates it we are going good and +ve
User in age bracket of 22 - 30 takes most of two wheler loan
Date.of.Birth'
train['Date.of.Birth'].describe()
count 233154 mean 1992-10-11 07:40:26.032579328 min 1969-01-01 00:00:00 25% 1980-10-05 00:00:00 50% 1988-05-04 00:00:00 75% 1994-04-26 00:00:00 max 2068-12-31 00:00:00 Name: Date.of.Birth, dtype: object
from datetime import timedelta, date
print(date(year=2019,month=1,day=1))
print(timedelta(days=365.25*100))
2019-01-01 36525 days, 0:00:00
#100 years
36525/365.25
100.0
# if date of birth is more then 2019
filter_date = train['Date.of.Birth'].dt.date > date(year=2019,month=1,day=1)
train.loc[filter_date,'Date.of.Birth'] -= timedelta(days=365.25*100) #100 years
train['Date.of.Birth'].describe()
count 233154 mean 1984-04-04 04:32:39.947502464 min 1949-09-15 00:00:00 25% 1977-05-04 00:00:00 50% 1986-01-01 00:00:00 75% 1992-05-19 00:00:00 max 2000-10-20 00:00:00 Name: Date.of.Birth, dtype: object
#birth Day and month exploration
train['birth_day'] = train['Date.of.Birth'].dt.day
train['birth_month'] = train['Date.of.Birth'].dt.month
#distribution of birth day
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['birth_day'])
plt.xlabel('month_date')
plt.ylabel('birt_count')
plt.show()
Observation
How it possible that most of user have birt date of 1-3, lets get deep into it
#distribution of birth day
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['birth_month'])
plt.xlabel('month')
plt.ylabel('birt_count')
plt.show()
Observation
Most of user are born on 1-jan we can conclude, there is some problem
Lets see how it effects target variable, Are these customer defaulting more
For this I will Digress to multivariated analysis
#finding loan default %tg for customers whose birth_month and birth_day is 1
train[(train.birth_day == 1) & (train.birth_month == 1)]['loan_default'].mean()
0.2572111467217122
Observation
There are 25% of such user whose birth_month and birth_day is 1 and they are defaulter
#overall default rate
train['loan_default'].mean()
0.2170711203753742
Observation
Overall default rate is less i.e. 22% appx
Concluding
User whose birth_month and birth_day is 1 they will be defaulter 25% of time
CREDIT.HISTORY.LENGTH and AVERAGE.ACCT.AGE
Converting these features in number now
train['CREDIT.HISTORY.LENGTH']
0 0yrs 0mon
1 1yrs 11mon
2 0yrs 0mon
3 1yrs 3mon
4 0yrs 0mon
...
233149 3yrs 3mon
233150 0yrs 6mon
233151 0yrs 0mon
233152 0yrs 0mon
233153 0yrs 0mon
Name: CREDIT.HISTORY.LENGTH, Length: 233154, dtype: object
Exploring this above data is hard so converting it into number
import re
#using regex to extract number from text
def map_credit_history_to_months(row_data):
re_data = list(map(int, re.findall(r'\d+',row_data)))
return re_data[0]*12 + re_data[1] #extract first number mult by 12 add months with it
#applying above function
train['credit_history_in_months'] = train['CREDIT.HISTORY.LENGTH'].apply(map_credit_history_to_months)
train['avg_acct_age_in_months'] = train['AVERAGE.ACCT.AGE'].apply(map_credit_history_to_months)
#credit_history_in_months
train['credit_history_in_months']
0 0
1 23
2 0
3 15
4 0
..
233149 39
233150 6
233151 0
233152 0
233153 0
Name: credit_history_in_months, Length: 233154, dtype: int64
train.credit_history_in_months.describe()
count 233154.000000 mean 16.252404 std 28.581255 min 0.000000 25% 0.000000 50% 0.000000 75% 24.000000 max 468.000000 Name: credit_history_in_months, dtype: float64
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['credit_history_in_months'].value_counts().values,bins=200)
plt.show()
plt.figure(figsize=(8,3),dpi=100)
plt.hist(train['avg_acct_age_in_months'].value_counts().values,bins=200)
plt.show()
from scipy.stats import chi2_contingency
Applying $chi^2$ test to see if there is significan relationship between target and other categorical variable
def biv_cat_cat_analysis(data,target_,cat,sort=False):
data_copy = data.copy()
if sort == True:
df = data[[cat,target_]].groupby([cat][target_].count().reset_index(name='count').sort_values(['count']),ascending=False)
data_copy = data_copy.merge(df, on=cat, how='left')
data = data_copy.sort_values(by='count', ascending=False)
data.drop(cat,axis=1, inplace=True)
data[cat] = data['count'].rank(ascending=False)
else:
pass
data = data[[cat,target_]][:]
#crosstab
table = pd.crosstab(data[target_], data[cat],)
f_obs = np.array([table.iloc[0][:].values, table.iloc[1][:].values])
#### Performing chi2-test ####
chi, p, dof, expected = chi2_contingency(f_obs)
if p < 0.05:
sig = True
else:
sig = False
sns.countplot(x=cat, hue=target_, data=data)
plt.title(f"p-value = {round(p,8)}\n difference significant? = {sig}\n")
df = data.groupby(cat)[target_].value_counts(normalize=True).unstack()
df.plot(kind='bar', stacked='True',title=str(df))
int_level = data[cat].value_counts()
biv_cat_cat_analysis(train,target_='loan_default',cat='Employment.Type')
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[95], line 1 ----> 1 biv_cat_cat_analysis(train,target_='loan_default',cat='Employment.Type') Cell In[94], line 25, in biv_cat_cat_analysis(data, target_, cat, sort) 22 else: 23 sig = False ---> 25 sns.countplot(x=cat, hue=target_, data=data) 26 plt.title(f"p-value = {round(p,8)}\n difference significant? = {sig}\n") 28 df = data.groupby(cat)[target_].value_counts(normalize=True).unstack() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:2955, in countplot(data, x, y, hue, order, hue_order, orient, color, palette, saturation, width, dodge, ax, **kwargs) 2952 if ax is None: 2953 ax = plt.gca() -> 2955 plotter.plot(ax, kwargs) 2956 return ax File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:1587, in _BarPlotter.plot(self, ax, bar_kws) 1585 """Make the plot.""" 1586 self.draw_bars(ax, bar_kws) -> 1587 self.annotate_axes(ax) 1588 if self.orient == "h": 1589 ax.invert_yaxis() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:767, in _CategoricalPlotter.annotate_axes(self, ax) 764 ax.set_ylim(-.5, len(self.plot_data) - .5, auto=None) 766 if self.hue_names is not None: --> 767 ax.legend(loc="best", title=self.hue_title) File ~\anaconda3\Lib\site-packages\matplotlib\axes\_axes.py:322, in Axes.legend(self, *args, **kwargs) 204 @_docstring.dedent_interpd 205 def legend(self, *args, **kwargs): 206 """ 207 Place a legend on the Axes. 208 (...) 320 .. plot:: gallery/text_labels_and_annotations/legend.py 321 """ --> 322 handles, labels, kwargs = mlegend._parse_legend_args([self], *args, **kwargs) 323 self.legend_ = mlegend.Legend(self, handles, labels, **kwargs) 324 self.legend_._remove_method = self._remove_legend File ~\anaconda3\Lib\site-packages\matplotlib\legend.py:1361, in _parse_legend_args(axs, handles, labels, *args, **kwargs) 1357 handles = [handle for handle, label 1358 in zip(_get_legend_handles(axs, handlers), labels)] 1360 elif len(args) == 0: # 0 args: automatically detect labels and handles. -> 1361 handles, labels = _get_legend_handles_labels(axs, handlers) 1362 if not handles: 1363 log.warning( 1364 "No artists with labels found to put in legend. Note that " 1365 "artists whose label start with an underscore are ignored " 1366 "when legend() is called with no argument.") File ~\anaconda3\Lib\site-packages\matplotlib\legend.py:1291, in _get_legend_handles_labels(axs, legend_handler_map) 1289 for handle in _get_legend_handles(axs, legend_handler_map): 1290 label = handle.get_label() -> 1291 if label and not label.startswith('_'): 1292 handles.append(handle) 1293 labels.append(label) AttributeError: 'numpy.int64' object has no attribute 'startswith'
Observation
Out Hypothesis was that self employed people defaults more and that was true as 23%appx time self employed people dont pay there loans or defaults which is 3% more then salaried people
biv_cat_cat_analysis(train,target_='loan_default',cat='manufacturer_id')
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[96], line 1 ----> 1 biv_cat_cat_analysis(train,target_='loan_default',cat='manufacturer_id') Cell In[94], line 25, in biv_cat_cat_analysis(data, target_, cat, sort) 22 else: 23 sig = False ---> 25 sns.countplot(x=cat, hue=target_, data=data) 26 plt.title(f"p-value = {round(p,8)}\n difference significant? = {sig}\n") 28 df = data.groupby(cat)[target_].value_counts(normalize=True).unstack() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:2955, in countplot(data, x, y, hue, order, hue_order, orient, color, palette, saturation, width, dodge, ax, **kwargs) 2952 if ax is None: 2953 ax = plt.gca() -> 2955 plotter.plot(ax, kwargs) 2956 return ax File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:1587, in _BarPlotter.plot(self, ax, bar_kws) 1585 """Make the plot.""" 1586 self.draw_bars(ax, bar_kws) -> 1587 self.annotate_axes(ax) 1588 if self.orient == "h": 1589 ax.invert_yaxis() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:767, in _CategoricalPlotter.annotate_axes(self, ax) 764 ax.set_ylim(-.5, len(self.plot_data) - .5, auto=None) 766 if self.hue_names is not None: --> 767 ax.legend(loc="best", title=self.hue_title) File ~\anaconda3\Lib\site-packages\matplotlib\axes\_axes.py:322, in Axes.legend(self, *args, **kwargs) 204 @_docstring.dedent_interpd 205 def legend(self, *args, **kwargs): 206 """ 207 Place a legend on the Axes. 208 (...) 320 .. plot:: gallery/text_labels_and_annotations/legend.py 321 """ --> 322 handles, labels, kwargs = mlegend._parse_legend_args([self], *args, **kwargs) 323 self.legend_ = mlegend.Legend(self, handles, labels, **kwargs) 324 self.legend_._remove_method = self._remove_legend File ~\anaconda3\Lib\site-packages\matplotlib\legend.py:1361, in _parse_legend_args(axs, handles, labels, *args, **kwargs) 1357 handles = [handle for handle, label 1358 in zip(_get_legend_handles(axs, handlers), labels)] 1360 elif len(args) == 0: # 0 args: automatically detect labels and handles. -> 1361 handles, labels = _get_legend_handles_labels(axs, handlers) 1362 if not handles: 1363 log.warning( 1364 "No artists with labels found to put in legend. Note that " 1365 "artists whose label start with an underscore are ignored " 1366 "when legend() is called with no argument.") File ~\anaconda3\Lib\site-packages\matplotlib\legend.py:1291, in _get_legend_handles_labels(axs, legend_handler_map) 1289 for handle in _get_legend_handles(axs, legend_handler_map): 1290 label = handle.get_label() -> 1291 if label and not label.startswith('_'): 1292 handles.append(handle) 1293 labels.append(label) AttributeError: 'numpy.int64' object has no attribute 'startswith'
Observation
Major Manufacturer are with id-86&45
NaN values can be seen as they are very small manufacturer
Conclusion
biv_cat_cat_analysis(train,target_='loan_default',cat='State_ID')
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) Cell In[97], line 1 ----> 1 biv_cat_cat_analysis(train,target_='loan_default',cat='State_ID') Cell In[94], line 25, in biv_cat_cat_analysis(data, target_, cat, sort) 22 else: 23 sig = False ---> 25 sns.countplot(x=cat, hue=target_, data=data) 26 plt.title(f"p-value = {round(p,8)}\n difference significant? = {sig}\n") 28 df = data.groupby(cat)[target_].value_counts(normalize=True).unstack() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:2955, in countplot(data, x, y, hue, order, hue_order, orient, color, palette, saturation, width, dodge, ax, **kwargs) 2952 if ax is None: 2953 ax = plt.gca() -> 2955 plotter.plot(ax, kwargs) 2956 return ax File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:1587, in _BarPlotter.plot(self, ax, bar_kws) 1585 """Make the plot.""" 1586 self.draw_bars(ax, bar_kws) -> 1587 self.annotate_axes(ax) 1588 if self.orient == "h": 1589 ax.invert_yaxis() File ~\anaconda3\Lib\site-packages\seaborn\categorical.py:767, in _CategoricalPlotter.annotate_axes(self, ax) 764 ax.set_ylim(-.5, len(self.plot_data) - .5, auto=None) 766 if self.hue_names is not None: --> 767 ax.legend(loc="best", title=self.hue_title) File ~\anaconda3\Lib\site-packages\matplotlib\axes\_axes.py:322, in Axes.legend(self, *args, **kwargs) 204 @_docstring.dedent_interpd 205 def legend(self, *args, **kwargs): 206 """ 207 Place a legend on the Axes. 208 (...) 320 .. plot:: gallery/text_labels_and_annotations/legend.py 321 """ --> 322 handles, labels, kwargs = mlegend._parse_legend_args([self], *args, **kwargs) 323 self.legend_ = mlegend.Legend(self, handles, labels, **kwargs) 324 self.legend_._remove_method = self._remove_legend File ~\anaconda3\Lib\site-packages\matplotlib\legend.py:1361, in _parse_legend_args(axs, handles, labels, *args, **kwargs) 1357 handles = [handle for handle, label 1358 in zip(_get_legend_handles(axs, handlers), labels)] 1360 elif len(args) == 0: # 0 args: automatically detect labels and handles. -> 1361 handles, labels = _get_legend_handles_labels(axs, handlers) 1362 if not handles: 1363 log.warning( 1364 "No artists with labels found to put in legend. Note that " 1365 "artists whose label start with an underscore are ignored " 1366 "when legend() is called with no argument.") File ~\anaconda3\Lib\site-packages\matplotlib\legend.py:1291, in _get_legend_handles_labels(axs, legend_handler_map) 1289 for handle in _get_legend_handles(axs, legend_handler_map): 1290 label = handle.get_label() -> 1291 if label and not label.startswith('_'): 1292 handles.append(handle) 1293 labels.append(label) AttributeError: 'numpy.int64' object has no attribute 'startswith'
from numpy import sqrt, abs, round
from scipy.stats import t as t_dist
from scipy.stats import norm
#list of all numerical_features
print(numerical_features)
['disbursed_amount', 'asset_cost', 'ltv', 'Current_pincode_ID', 'PERFORM_CNS.SCORE', 'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'NO.OF_INQUIRIES']
def two_sample_T(X1, X2, sd, sd1, n1, n2):
ovr_sd = sqrt(sd**2/n1 + sd1**2/n2)
t = (X1 - X2)/ovr_sd
df = n1+n2-2
pval = 2*(1 - t_dist.cdf(abs(t),df))
return pval
def two_sample_Z(X1, X2, sigma, sigma1, n1, n2):
ovr_sigma = sqrt(sigma**2/n1 + sigma1**2/n2)
z = (X1 - X2)/ovr_sigma
pval = 2*(1 - norm.cdf(abs(z)))
return pval
# Bivariate Cont Cat Exploration Function
def bi_cat_con_analysis(data, continious_var, catategorical_var, category):
#creating 2 samples
x1 = data[continious_var][data[catategorical_var]==category][:]
x2 = data[continious_var][~(data[catategorical_var]==category)][:]
n1, n2 = x1.shape[0], x2.shape[0]
m1, m2 = x1.mean(), x2.mean()
std1, std2 = x1.std(), x2.std()
#p-values calculation
t_p_val = two_sample_T(m1, m2, std1, std2, n1, n2)
z_p_val = two_sample_Z(m1, m2, std1, std2, n1, n2)
#pivot table
table = pd.pivot_table(data=data, values=continious_var, columns=catategorical_var, aggfunc = np.mean)
#plotting
plt.figure(figsize = (10,4),dpi=100)
plt.subplot(1,2,1)
sns.barplot(x=[str(category),'not {category}'], y=[m1, m2])
plt.ylabel(f'mean {continious_var}')
plt.xlabel(catategorical_var)
plt.title(f't-test p-value = {t_p_val}\nz-test p-value = {z_p_val}\n{table}')
# boxplot
plt.subplot(1,2,2)
sns.boxplot(x=catategorical_var, y=continious_var, data=data)
plt.title('Boxplot of Continuous Variable by Categorical Variable')
bi_cat_con_analysis(train, 'disbursed_amount', 'loan_default', 1)
Observation
p-value for both test is 0 i.e loan_default have a significan relationship with disbursed_amount
There are some outliers which are effecting out visualization lets remove them first
average disbursed_amount for 0-53826 and 1-5670 ==> but it is not vissible in 2nd-plot which is due to outliers#removing outliers
upper_limit = np.percentile(train.disbursed_amount.values, 99) #after 99%tile
lower_limit = np.percentile(train.disbursed_amount.values, 1) #after 1%tile
#cliping
train['disbursed_amount'][train['disbursed_amount'] > upper_limit] = upper_limit #if value is grater then upper_limit replace it with upper_limit
train['disbursed_amount'][train['disbursed_amount'] < lower_limit] = lower_limit
bi_cat_con_analysis(train, 'disbursed_amount', 'loan_default',1)
Observation
see disbursed_amount of defaulter-1 is higher then disbursed_amount of not-default-0
Now Ploting Distribution Plot for Defaulter and Non-Defaulter
#alternate way of visualising disbursed_amount distribution for defaulters & non defaulters
plt.figure(figsize = (10,4),dpi=100)
sns.distplot(train["disbursed_amount"][train['loan_default'] == 1] , color='skyblue', label='Defaulters')
sns.distplot(train["disbursed_amount"][train['loan_default'] == 0] , color='orange', label='Not Defaulters')
plt.legend()
plt.show()
bi_cat_con_analysis(train, 'ltv', 'loan_default', 1)
Observation
ltv is higher for defaults-1, see average itv values and in 2nd-plot
It mean if user have taken loan of rs10 and takes loan amount say rs9 with rs1 downpament then this user is more prone to default
#alternate way of visualising ltv distribution for defaulters & non defaulters
plt.figure(figsize=(10,4),dpi=100)
sns.distplot(train["ltv"][train['loan_default'] == 1] , color='skyblue', label='Defaulters')
sns.distplot(train["ltv"][train['loan_default'] == 0] , color='orange', label='Not Defaulters')
plt.legend()
plt.show()
bi_cat_con_analysis(train, 'asset_cost', 'loan_default', 1)
#removing outliers
upper_limit = np.percentile(train.asset_cost.values, 99) #after 99%tile
lower_limit = np.percentile(train.asset_cost.values, 1) #after 1%tile
#cliping
train['asset_cost'][train['asset_cost'] > upper_limit] = upper_limit #if value is grater then upper_limit replace it with upper_limit
train['asset_cost'][train['asset_cost'] < lower_limit] = lower_limit
bi_cat_con_analysis(train, 'asset_cost', 'loan_default', 1)
Observation
asset_cost is not that effective feature
#alternate way of visualising asset_cost distribution for defaulters & non defaulters
plt.figure(figsize=(10,4),dpi=100)
sns.distplot(train['asset_cost'][train['loan_default'] == 1] , color='skyblue', label='Defaulters')
sns.distplot(train['asset_cost'][train['loan_default'] == 0] , color='orange', label='Not Defaulters')
plt.legend()
plt.show()
bi_cat_con_analysis(train, 'PERFORM_CNS.SCORE', 'loan_default', 1)
Observation
By our previous analysis we know most of these values are 0, lets plot this after removing entries having 0 in them
#performance_cns_score vs loan default with 0s removed
bi_cat_con_analysis(train[train['PERFORM_CNS.SCORE'] > 0], 'PERFORM_CNS.SCORE', 'loan_default', 1)
Observation
For non defaulter we have high value of PERFORM_CNS.SCORE i.e 0-591 appx
#alternate way of visualising PERFORM_CNS.SCORE distribution for defaulters & non defaulters
plt.figure(figsize=(10,4),dpi=100)
sns.distplot(train['PERFORM_CNS.SCORE'][(train['loan_default'] == 1) & (train['PERFORM_CNS.SCORE'] > 0)] , color='skyblue', label='Defaulters')
sns.distplot(train['PERFORM_CNS.SCORE'][(train['loan_default'] == 0) & (train['PERFORM_CNS.SCORE'] > 0)] , color='orange', label='Not-Defaulters')
plt.legend()
plt.show()
#age_in_years vs loan default with 0s removed
bi_cat_con_analysis(train, 'age_in_years', 'loan_default', 1)
#removing outliers
upper_limit = np.percentile(train.age_in_years.values, 100) #after 100%tile
lower_limit = np.percentile(train.age_in_years.values, 9) #after 9%tile
#cliping
train['age_in_years'][train['age_in_years'] > upper_limit] = upper_limit #if value is grater then upper_limit replace it with upper_limit
train['age_in_years'][train['age_in_years'] < lower_limit] = lower_limit
#age_in_years vs loan default with 0s removed
bi_cat_con_analysis(train, 'age_in_years', 'loan_default', 1)
#alternate way of visualising age_in_years distribution for defaulters & non defaulters
plt.figure(figsize=(10,4),dpi=100)
sns.distplot(train['age_in_years'][train['loan_default'] == 1] , color='skyblue', label='Defaulters')
sns.distplot(train['age_in_years'][train['loan_default'] == 0] , color='orange', label='Not Defaulters')
plt.legend()
plt.show()
# of primary accounts vs default rate
# function removing outliers from primary account
def kick_outlieres(data,feature):
ulimit = np.percentile(train.credit_history_in_months.values, 99)
data[feature][data[feature] > ulimit] = ulimit
return data
df = kick_outlieres(train,'PRI.NO.OF.ACCTS')
# Observing Default rate vs primary number of accounts
biv_cat_cat_analysis(train, 'loan_default', 'PRI.NO.OF.ACCTS' )
Observation
If PRI.NO.OF.ACCTS are more then 1 then we can say customer have take lone previously and what we can see is if person have taken loan previously, he/she is less prone to defaulting
Binning can be used to see these graphs more clearly but still we can get intution
# Similarly for overdue accounts vs loan default rate
biv_cat_cat_analysis(train, 'loan_default', 'PRI.OVERDUE.ACCTS' )
Observation
As PRI.OVERDUE.ACCTS are increasing defaults are incresing to
# Delinquent accounts vs loan default rate
biv_cat_cat_analysis(train, 'loan_default', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS' )
# Overdue accounts vs default rate
biv_cat_cat_analysis(train,'loan_default','no_overdue_prim_loan_flag')
Default rate[loan_default] vs ID features
#default rate vs binned pin code
biv_cat_cat_analysis(train,'loan_default','Current_pincode_ID')
Observation
We require bining for clear views lets do that first
#binning pin code
train['Current_pincode_ID_binned'] = pd.cut(train['Current_pincode_ID'], bins=25)
#default rate vs binned pin code after binning
biv_cat_cat_analysis(train,'loan_default','Current_pincode_ID_binned')
Observatoin
No clear patern are seen like more default in some reason kinde of stuff, as we were finding in previous analysis and we came to this conclusion that pin code have some patern in them, it is not the case
#default rate vs branch id
biv_cat_cat_analysis(train,'loan_default','branch_id')
Observation
It seams branch_id are all randome values
#creating bins on basis of [ count of defaulter customers for each supplier ]
df = train[['supplier_id','loan_default']].groupby(['supplier_id'])['loan_default'].count().reset_index(name='loan_default_count')\
.sort_values(['loan_default_count'],ascending=False)
train = train.merge(df, on='supplier_id', how='left')
#binning
train['bin_count_supp'] = pd.cut(train['loan_default_count'], bins=20)
#supplier count frequency vs default rate
biv_cat_cat_analysis(train,'loan_default','bin_count_supp')
Observation
Big supliers means more customers then these supliers have more defaulting customer
We counted defaulting customer for each supliers and based on ount we have made bins
df = train[['Employee_code_ID','loan_default']].groupby(['Employee_code_ID'])['loan_default'].count().reset_index(name='loan_default_cnt').sort_values(['loan_default_cnt'],ascending=False)
train = train.merge(df, on='Employee_code_ID', how='left')
#binning
train['bin_count_Employee_code_ID'] = pd.cut(train['loan_default_cnt'], bins=5)
#supplier count frequency vs default rate
biv_cat_cat_analysis(train,'loan_default','bin_count_Employee_code_ID')
Observation
Very few employ handles more then 500 customer but there default rate is higher , maybe due to work load as they are handking more customers
#patching outliers
ulimit = np.percentile(train.credit_history_in_months.values, 99)
llimit = np.percentile(train.credit_history_in_months.values, 1)
train['credit_history_in_months'][train.credit_history_in_months > ulimit] = ulimit
train['credit_history_in_months'][train.credit_history_in_months < llimit] = llimit
#alternate way of visualising age_in_years distribution for defaulters & non defaulters
plt.figure(figsize=(10,4),dpi=100)
sns.distplot(train['credit_history_in_months'][train['loan_default'] == 1] , color='skyblue', label='Defaulters')
sns.distplot(train['credit_history_in_months'][train['loan_default'] == 0] , color='orange', label='Not Defaulters')
plt.legend()
plt.show()
Now we can say that least part of data exploration is done by us still there are many factord which can be explored
But now I am convinced to make a model out of this data
#loading data
path = '/kaggle/input/bank-loan-data/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')
train.head()
| UniqueID | disbursed_amount | asset_cost | ltv | branch_id | supplier_id | manufacturer_id | Current_pincode_ID | Date.of.Birth | Employment.Type | DisbursalDate | State_ID | Employee_code_ID | MobileNo_Avl_Flag | Aadhar_flag | PAN_flag | VoterID_flag | Driving_flag | Passport_flag | PERFORM_CNS.SCORE | PERFORM_CNS.SCORE.DESCRIPTION | PRI.NO.OF.ACCTS | PRI.ACTIVE.ACCTS | PRI.OVERDUE.ACCTS | PRI.CURRENT.BALANCE | PRI.SANCTIONED.AMOUNT | PRI.DISBURSED.AMOUNT | SEC.NO.OF.ACCTS | SEC.ACTIVE.ACCTS | SEC.OVERDUE.ACCTS | SEC.CURRENT.BALANCE | SEC.SANCTIONED.AMOUNT | SEC.DISBURSED.AMOUNT | PRIMARY.INSTAL.AMT | SEC.INSTAL.AMT | NEW.ACCTS.IN.LAST.SIX.MONTHS | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | AVERAGE.ACCT.AGE | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | loan_default | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 420825 | 50578 | 58400 | 89.55 | 67 | 22807 | 45 | 1441 | 01-01-84 | Salaried | 03-08-18 | 6 | 1998 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | No Bureau History Available | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0yrs 0mon | 0 | 0 |
| 1 | 537409 | 47145 | 65550 | 73.23 | 67 | 22807 | 45 | 1502 | 31-07-85 | Self employed | 26-09-18 | 6 | 1998 | 1 | 1 | 0 | 0 | 0 | 0 | 598 | I-Medium Risk | 1 | 1 | 1 | 27600 | 50200 | 50200 | 0 | 0 | 0 | 0 | 0 | 0 | 1991 | 0 | 0 | 1 | 1yrs 11mon | 1yrs 11mon | 0 | 1 |
| 2 | 417566 | 53278 | 61360 | 89.63 | 67 | 22807 | 45 | 1497 | 24-08-85 | Self employed | 01-08-18 | 6 | 1998 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | No Bureau History Available | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0yrs 0mon | 0 | 0 |
| 3 | 624493 | 57513 | 66113 | 88.48 | 67 | 22807 | 45 | 1501 | 30-12-93 | Self employed | 26-10-18 | 6 | 1998 | 1 | 1 | 0 | 0 | 0 | 0 | 305 | L-Very High Risk | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31 | 0 | 0 | 0 | 0yrs 8mon | 1yrs 3mon | 1 | 1 |
| 4 | 539055 | 52378 | 60300 | 88.39 | 67 | 22807 | 45 | 1495 | 09-12-77 | Self employed | 26-09-18 | 6 | 1998 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | No Bureau History Available | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0yrs 0mon | 1 | 1 |
test.head()
| UniqueID | disbursed_amount | asset_cost | ltv | branch_id | supplier_id | manufacturer_id | Current_pincode_ID | Date.of.Birth | Employment.Type | DisbursalDate | State_ID | Employee_code_ID | MobileNo_Avl_Flag | Aadhar_flag | PAN_flag | VoterID_flag | Driving_flag | Passport_flag | PERFORM_CNS.SCORE | PERFORM_CNS.SCORE.DESCRIPTION | PRI.NO.OF.ACCTS | PRI.ACTIVE.ACCTS | PRI.OVERDUE.ACCTS | PRI.CURRENT.BALANCE | PRI.SANCTIONED.AMOUNT | PRI.DISBURSED.AMOUNT | SEC.NO.OF.ACCTS | SEC.ACTIVE.ACCTS | SEC.OVERDUE.ACCTS | SEC.CURRENT.BALANCE | SEC.SANCTIONED.AMOUNT | SEC.DISBURSED.AMOUNT | PRIMARY.INSTAL.AMT | SEC.INSTAL.AMT | NEW.ACCTS.IN.LAST.SIX.MONTHS | DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS | AVERAGE.ACCT.AGE | CREDIT.HISTORY.LENGTH | NO.OF_INQUIRIES | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 655269 | 53478 | 63558 | 86.54 | 67 | 22807 | 45 | 1497 | 01-01-74 | Salaried | 03-11-18 | 6 | 1998 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | No Bureau History Available | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0yrs 0mon | 0 |
| 1 | 723482 | 55513 | 63163 | 89.45 | 67 | 22807 | 45 | 1497 | 20-05-85 | Self employed | 20-11-18 | 6 | 1998 | 1 | 1 | 0 | 0 | 0 | 0 | 749 | C-Very Low Risk | 2 | 1 | 0 | 43898 | 48780 | 48780 | 0 | 0 | 0 | 0 | 0 | 0 | 5605 | 0 | 1 | 0 | 0yrs 8mon | 1yrs 0mon | 1 |
| 2 | 758529 | 65282 | 84320 | 79.93 | 78 | 23135 | 86 | 2071 | 14-10-95 | Salaried | 29-11-18 | 4 | 1646 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | No Bureau History Available | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0yrs 0mon | 0 |
| 3 | 763449 | 46905 | 63896 | 76.58 | 78 | 17014 | 45 | 2070 | 01-06-73 | Self employed | 29-11-18 | 4 | 1646 | 1 | 1 | 0 | 0 | 0 | 0 | 14 | Not Scored: Only a Guarantor | 1 | 1 | 1 | 132480 | 255000 | 255000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2yrs 5mon | 2yrs 5mon | 0 |
| 4 | 708663 | 51428 | 63896 | 86.08 | 78 | 17014 | 45 | 2069 | 01-06-72 | Salaried | 17-11-18 | 4 | 1646 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | No Bureau History Available | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0yrs 0mon | 0yrs 0mon | 0 |
train.shape,test.shape
((233154, 41), (112392, 40))
#stacking train over test
data = train.append(test,ignore_index=True)
data.shape
(345546, 41)
#null check
data.isnull().sum()
UniqueID 0 disbursed_amount 0 asset_cost 0 ltv 0 branch_id 0 supplier_id 0 manufacturer_id 0 Current_pincode_ID 0 Date.of.Birth 0 Employment.Type 11104 DisbursalDate 0 State_ID 0 Employee_code_ID 0 MobileNo_Avl_Flag 0 Aadhar_flag 0 PAN_flag 0 VoterID_flag 0 Driving_flag 0 Passport_flag 0 PERFORM_CNS.SCORE 0 PERFORM_CNS.SCORE.DESCRIPTION 0 PRI.NO.OF.ACCTS 0 PRI.ACTIVE.ACCTS 0 PRI.OVERDUE.ACCTS 0 PRI.CURRENT.BALANCE 0 PRI.SANCTIONED.AMOUNT 0 PRI.DISBURSED.AMOUNT 0 SEC.NO.OF.ACCTS 0 SEC.ACTIVE.ACCTS 0 SEC.OVERDUE.ACCTS 0 SEC.CURRENT.BALANCE 0 SEC.SANCTIONED.AMOUNT 0 SEC.DISBURSED.AMOUNT 0 PRIMARY.INSTAL.AMT 0 SEC.INSTAL.AMT 0 NEW.ACCTS.IN.LAST.SIX.MONTHS 0 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 0 AVERAGE.ACCT.AGE 0 CREDIT.HISTORY.LENGTH 0 NO.OF_INQUIRIES 0 loan_default 112392 dtype: int64
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 345546 entries, 0 to 345545 Data columns (total 41 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 UniqueID 345546 non-null int64 1 disbursed_amount 345546 non-null int64 2 asset_cost 345546 non-null int64 3 ltv 345546 non-null float64 4 branch_id 345546 non-null int64 5 supplier_id 345546 non-null int64 6 manufacturer_id 345546 non-null int64 7 Current_pincode_ID 345546 non-null int64 8 Date.of.Birth 345546 non-null object 9 Employment.Type 334442 non-null object 10 DisbursalDate 345546 non-null object 11 State_ID 345546 non-null int64 12 Employee_code_ID 345546 non-null int64 13 MobileNo_Avl_Flag 345546 non-null int64 14 Aadhar_flag 345546 non-null int64 15 PAN_flag 345546 non-null int64 16 VoterID_flag 345546 non-null int64 17 Driving_flag 345546 non-null int64 18 Passport_flag 345546 non-null int64 19 PERFORM_CNS.SCORE 345546 non-null int64 20 PERFORM_CNS.SCORE.DESCRIPTION 345546 non-null object 21 PRI.NO.OF.ACCTS 345546 non-null int64 22 PRI.ACTIVE.ACCTS 345546 non-null int64 23 PRI.OVERDUE.ACCTS 345546 non-null int64 24 PRI.CURRENT.BALANCE 345546 non-null int64 25 PRI.SANCTIONED.AMOUNT 345546 non-null int64 26 PRI.DISBURSED.AMOUNT 345546 non-null int64 27 SEC.NO.OF.ACCTS 345546 non-null int64 28 SEC.ACTIVE.ACCTS 345546 non-null int64 29 SEC.OVERDUE.ACCTS 345546 non-null int64 30 SEC.CURRENT.BALANCE 345546 non-null int64 31 SEC.SANCTIONED.AMOUNT 345546 non-null int64 32 SEC.DISBURSED.AMOUNT 345546 non-null int64 33 PRIMARY.INSTAL.AMT 345546 non-null int64 34 SEC.INSTAL.AMT 345546 non-null int64 35 NEW.ACCTS.IN.LAST.SIX.MONTHS 345546 non-null int64 36 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 345546 non-null int64 37 AVERAGE.ACCT.AGE 345546 non-null object 38 CREDIT.HISTORY.LENGTH 345546 non-null object 39 NO.OF_INQUIRIES 345546 non-null int64 40 loan_default 233154 non-null float64 dtypes: float64(2), int64(33), object(6) memory usage: 108.1+ MB
Credit History & Average Account Age
#will apply re on it
print(data['CREDIT.HISTORY.LENGTH'].head(3))
#will apply re on it
print(data['AVERAGE.ACCT.AGE'].head(3))
0 0yrs 0mon 1 1yrs 11mon 2 0yrs 0mon Name: CREDIT.HISTORY.LENGTH, dtype: object 0 0yrs 0mon 1 1yrs 11mon 2 0yrs 0mon Name: AVERAGE.ACCT.AGE, dtype: object
#function to extract months
def map_to_months(data_rows):
row_data = list(map(int, re.findall(r'\d+',data_rows)))
return row_data[0]*12 + row_data[1]
data['credit_history_in_months'] = data['CREDIT.HISTORY.LENGTH'].apply(map_to_months)
data['avg_acct_age_in_months'] = data['AVERAGE.ACCT.AGE'].apply(map_to_months)
#after applying re on it
data['avg_acct_age_in_months'].head(3)
0 0 1 23 2 0 Name: avg_acct_age_in_months, dtype: int64
AGE
#before
print(data['Date.of.Birth'].head(3))
print(data['DisbursalDate'].head(3))
0 01-01-84 1 31-07-85 2 24-08-85 Name: Date.of.Birth, dtype: object 0 03-08-18 1 26-09-18 2 01-08-18 Name: DisbursalDate, dtype: object
data['Date.of.Birth'] = pd.to_datetime(data['Date.of.Birth'], format='%d-%m-%y')
data['DisbursalDate'] = pd.to_datetime(data['DisbursalDate'], format='%d-%m-%y')
future = data['Date.of.Birth'].dt.date > date(year=2019,month=1,day=1)
data.loc[future, 'Date.of.Birth'] -= timedelta(days=365.25*100)
data['age_in_years'] = ((data['DisbursalDate'] - data['Date.of.Birth']).dt.days)/365.25
data['dob_months'] = data['Date.of.Birth'].dt.month
data['dob_days'] = data['Date.of.Birth'].dt.day
data['dob_weeks'] = data['Date.of.Birth'].dt.week
Disbursal Date
data['disb_months'] = data['DisbursalDate'].dt.month
data['disb_days'] = data['DisbursalDate'].dt.day
data['disb_weeks'] = data['DisbursalDate'].dt.week
#after
print(data['Date.of.Birth'].head(3))
print(data['DisbursalDate'].head(3))
0 1984-01-01 1 1985-07-31 2 1985-08-24 Name: Date.of.Birth, dtype: datetime64[ns] 0 2018-08-03 1 2018-09-26 2 2018-08-01 Name: DisbursalDate, dtype: datetime64[ns]
Primary & Secondary Accounts Info
for secondary accounts we have very less information so we are adding primary and secondary accounts
data['ACTIVE.ACCTS'] = data['PRI.ACTIVE.ACCTS'] + data['SEC.ACTIVE.ACCTS']
data['CURRENT.BALANCE'] = data['PRI.CURRENT.BALANCE'] + data['SEC.CURRENT.BALANCE']
data['DISBURSED.AMOUNT'] = data['PRI.DISBURSED.AMOUNT'] + data['SEC.DISBURSED.AMOUNT']
data['NO.OF.ACCTS'] = data['SEC.NO.OF.ACCTS'] + data['PRI.NO.OF.ACCTS']
data['OVERDUE.ACCTS'] = data['PRI.OVERDUE.ACCTS'] + data['SEC.OVERDUE.ACCTS']
data['SANCTIONED.AMOUNT' ] = data['PRI.SANCTIONED.AMOUNT'] + data['SEC.SANCTIONED.AMOUNT']
data['INSTAL.AMT'] = data['PRIMARY.INSTAL.AMT'] + data['SEC.INSTAL.AMT']
data['SANCTION_DISBURSED'] = data['SANCTIONED.AMOUNT'] - data['DISBURSED.AMOUNT']
data['NO_DEACTIVE_ACCOUNTS'] = data['NO.OF.ACCTS'] - data['ACTIVE.ACCTS']
Loan Information
disbursed_amount ==> $$da = \frac{ltv}{100} * asset_cost$$ ==> Commission involved $$= [ \frac{ltv}{100} * asset_cost ] - da$$
#Commission involved
data['extra_finance'] = data['asset_cost'] * (data['ltv']/100) - data['disbursed_amount']
Score Description
data['PERFORM_CNS.SCORE.DESCRIPTION'].unique()
array(['No Bureau History Available', 'I-Medium Risk', 'L-Very High Risk',
'A-Very Low Risk',
'Not Scored: Not Enough Info available on the customer',
'D-Very Low Risk', 'M-Very High Risk', 'B-Very Low Risk',
'C-Very Low Risk', 'E-Low Risk', 'H-Medium Risk', 'F-Low Risk',
'K-High Risk',
'Not Scored: No Activity seen on the customer (Inactive)',
'Not Scored: Sufficient History Not Available',
'Not Scored: No Updates available in last 36 months', 'G-Low Risk',
'J-High Risk', 'Not Scored: Only a Guarantor',
'Not Scored: More than 50 active Accounts found'], dtype=object)
data['PERFORM_CNS.SCORE.DESCRIPTION'].replace({'C-Very Low Risk':'Very Low Risk','A-Very Low Risk':'Very Low Risk',
'D-Very Low Risk':'Very Low Risk','B-Very Low Risk':'Very Low Risk',
'M-Very High Risk':'Very High Risk','L-Very High Risk':'Very High Risk',
'F-Low Risk':'Low Risk','E-Low Risk':'Low Risk','H-Medium Risk':'Medium Risk',
'I-Medium Risk':'Medium Risk','J-High Risk':'High Risk',
'K-High Risk':'High Risk'},inplace=True)
data['Not_Scored'] = np.where(data['PERFORM_CNS.SCORE.DESCRIPTION'].str.contains('Not Scored'),1,0)
data['Very_Low'] = np.where(data['PERFORM_CNS.SCORE.DESCRIPTION'].str.contains('Very Low'),1,0)
data['Very_High'] = np.where(data['PERFORM_CNS.SCORE.DESCRIPTION'].str.contains('Very High'),1,0)
data['No_History'] = np.where(data['PERFORM_CNS.SCORE.DESCRIPTION'].str.contains('No Bureau'),1,0)
employment type
data['Employment.Type'].unique()
array(['Salaried', 'Self employed', nan], dtype=object)
data['Employment.Type'] = data['Employment.Type'].apply(lambda x: 0 if x == "Salaried" else 1)
data['Employment.Type'].unique()
array([0, 1])
Identity Proof Flags
I have added all of them collectvly they will make more sense
for i in ['Aadhar_flag','Driving_flag','PAN_flag','Passport_flag','VoterID_flag']:
data[i] = data[i].astype(np.object)
data['Total_Flag'] = data['Driving_flag'] + data['Aadhar_flag'] + data['PAN_flag'] + data['Passport_flag'] + data['VoterID_flag']
NOTE ==> for tree based model we can keep all these id's but for Logistic Regression it is hard as loat of features will effect it's performance
#droping some of id's and some of other features which have be enginered
drop_cols = ['Date.of.Birth','UniqueID','MobileNo_Avl_Flag',
'PERFORM_CNS.SCORE.DESCRIPTION','AVERAGE.ACCT.AGE','CREDIT.HISTORY.LENGTH',
'Aadhar_flag', 'Driving_flag', 'PAN_flag', 'Passport_flag', 'VoterID_flag',
'Employee_code_ID','branch_id','State_ID','manufacturer_id','supplier_id']
data.drop(drop_cols,axis=1,inplace=True)
Re-Split
No again splitting train and test data from main data
Logick is where we dont have loan_default information that is test data
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 345546 entries, 0 to 345545 Data columns (total 49 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 disbursed_amount 345546 non-null int64 1 asset_cost 345546 non-null int64 2 ltv 345546 non-null float64 3 Current_pincode_ID 345546 non-null int64 4 Employment.Type 345546 non-null int64 5 DisbursalDate 345546 non-null datetime64[ns] 6 PERFORM_CNS.SCORE 345546 non-null int64 7 PRI.NO.OF.ACCTS 345546 non-null int64 8 PRI.ACTIVE.ACCTS 345546 non-null int64 9 PRI.OVERDUE.ACCTS 345546 non-null int64 10 PRI.CURRENT.BALANCE 345546 non-null int64 11 PRI.SANCTIONED.AMOUNT 345546 non-null int64 12 PRI.DISBURSED.AMOUNT 345546 non-null int64 13 SEC.NO.OF.ACCTS 345546 non-null int64 14 SEC.ACTIVE.ACCTS 345546 non-null int64 15 SEC.OVERDUE.ACCTS 345546 non-null int64 16 SEC.CURRENT.BALANCE 345546 non-null int64 17 SEC.SANCTIONED.AMOUNT 345546 non-null int64 18 SEC.DISBURSED.AMOUNT 345546 non-null int64 19 PRIMARY.INSTAL.AMT 345546 non-null int64 20 SEC.INSTAL.AMT 345546 non-null int64 21 NEW.ACCTS.IN.LAST.SIX.MONTHS 345546 non-null int64 22 DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS 345546 non-null int64 23 NO.OF_INQUIRIES 345546 non-null int64 24 loan_default 233154 non-null float64 25 credit_history_in_months 345546 non-null int64 26 avg_acct_age_in_months 345546 non-null int64 27 age_in_years 345546 non-null float64 28 dob_months 345546 non-null int64 29 dob_days 345546 non-null int64 30 dob_weeks 345546 non-null int64 31 disb_months 345546 non-null int64 32 disb_days 345546 non-null int64 33 disb_weeks 345546 non-null int64 34 ACTIVE.ACCTS 345546 non-null int64 35 CURRENT.BALANCE 345546 non-null int64 36 DISBURSED.AMOUNT 345546 non-null int64 37 NO.OF.ACCTS 345546 non-null int64 38 OVERDUE.ACCTS 345546 non-null int64 39 SANCTIONED.AMOUNT 345546 non-null int64 40 INSTAL.AMT 345546 non-null int64 41 SANCTION_DISBURSED 345546 non-null int64 42 NO_DEACTIVE_ACCOUNTS 345546 non-null int64 43 extra_finance 345546 non-null float64 44 Not_Scored 345546 non-null int64 45 Very_Low 345546 non-null int64 46 Very_High 345546 non-null int64 47 No_History 345546 non-null int64 48 Total_Flag 345546 non-null object dtypes: datetime64[ns](1), float64(4), int64(43), object(1) memory usage: 129.2+ MB
data['Employment.Type']
0 0
1 1
2 1
3 1
4 1
..
345541 1
345542 1
345543 1
345544 1
345545 1
Name: Employment.Type, Length: 345546, dtype: int64
train = data[data['loan_default'].isnull() != True]
test = data[data['loan_default'].isnull() == True]
train.DisbursalDate.describe()
count 233154 unique 84 top 2018-10-31 00:00:00 freq 8826 first 2018-08-01 00:00:00 last 2018-10-31 00:00:00 Name: DisbursalDate, dtype: object
Observation
All dates are in past for train data
test.DisbursalDate.describe()
count 112392 unique 27 top 2018-11-15 00:00:00 freq 7803 first 2018-11-03 00:00:00 last 2018-11-30 00:00:00 Name: DisbursalDate, dtype: object
Observation
All dates are in future for test data in compare to train data
We have to make 1 validation set to check our model performance, but question is:
we can use randome split or not??? randome_split as that will not preserve structure of dataOne way to overcome to it is
import datetime
x_train_not_final = train[train.DisbursalDate < datetime.datetime(2018,10,1)]
x_valid_not_final = train[train.DisbursalDate >= datetime.datetime(2018,10,1)] #last one month for validation
based on DisbursalDate we splited out data set into train and validation, now droping it as we have extracted features out of it before
x_train = x_train_not_final.drop(['loan_default','DisbursalDate'],axis=1)
y_train = x_train_not_final.loan_default
x_valid = x_valid_not_final.drop(['loan_default','DisbursalDate'],axis=1)
y_valid = x_valid_not_final.loan_default
x_train.shape,y_train.shape,x_valid.shape,y_valid.shape
((134790, 47), (134790,), (98364, 47), (98364,))
from sklearn.linear_model import LogisticRegression as lr
from sklearn.tree import DecisionTreeClassifier as dtc
from sklearn.metrics import confusion_matrix,accuracy_score,recall_score,roc_auc_score,classification_report,roc_auc_score,roc_curve,auc
#modeling function
def model(algorithm, dtrain_X, dtrain_Y, dtest_X, dtest_Y, cols=None):
algorithm.fit(dtrain_X[cols],dtrain_Y)
predictions = algorithm.predict(dtest_X[cols])
print(algorithm)
print(f"Accuracy score : {accuracy_score(predictions,dtest_Y)}")
print(f"Recall score : {recall_score(predictions,dtest_Y)}")
print(f"Classification report :\n{classification_report(predictions,dtest_Y)}")
fig = plt.figure(figsize=(10,8))
ax = fig.add_subplot(111)
prediction_probabilities = algorithm.predict_proba(dtest_X[cols])[:,1]
fpr , tpr , thresholds = roc_curve(dtest_Y,prediction_probabilities)
ax.plot(fpr,tpr,label = ["Area under curve : ",auc(fpr,tpr)],linewidth=2,linestyle="dotted")
ax.plot([0,1],[0,1],linewidth=2,linestyle="dashed")
plt.legend(loc="best")
plt.title("ROC-CURVE & AREA UNDER CURVE")
dtc = dtc(max_depth=7)
#using modelling function
model(dtc,x_train,y_train,x_valid,y_valid,x_train.columns)
DecisionTreeClassifier(max_depth=7)
Accuracy score : 0.7639380261071124
Recall score : 0.20394736842105263
Classification report :
precision recall f1-score support
0.0 1.00 0.76 0.87 98212
1.0 0.00 0.20 0.00 152
accuracy 0.76 98364
macro avg 0.50 0.48 0.43 98364
weighted avg 1.00 0.76 0.86 98364
pred = dtc.predict_proba(test.drop(['DisbursalDate','loan_default'], axis =1))
pred
array([[0.75066013, 0.24933987],
[0.7805326 , 0.2194674 ],
[0.71208142, 0.28791858],
...,
[0.87126237, 0.12873763],
[0.72938689, 0.27061311],
[0.88752247, 0.11247753]])
pred[:,-1]
array([0.24933987, 0.2194674 , 0.28791858, ..., 0.12873763, 0.27061311,
0.11247753])
#reading sample sub files which have randome loan_default values
sample_submission = pd.read_csv(path + 'sample_submission.csv')
#replacing loan_default with predictions
sample_submission['loan_default'] = pred[:,-1]
#writing prediciton in new file
sample_submission.to_csv('pred_dtc_new.csv', index=False)
sample_submission.head()
| UniqueID | loan_default | |
|---|---|---|
| 0 | 655269 | 0.249340 |
| 1 | 723482 | 0.219467 |
| 2 | 758529 | 0.287919 |
| 3 | 763449 | 0.194112 |
| 4 | 708663 | 0.287919 |
Now we can plot this dt ans see if it alined with out hypothesis